# Load the IMG export for GOLD study Gs0161344, drop unused columns, and
# discard re-annotated genomes and the WREF plot samples.
NEON_metagenomes <- read_tsv("data/NEON/exported_img_data_Gs0161344_NEON.tsv") %>%
  select(-c(Domain, `Sequencing Status`, `Sequencing Center`)) %>%
  rename(`Genome Name` = `Genome Name / Sample Name`) %>%
  filter(
    str_detect(`Genome Name`, "re-annotation", negate = TRUE),
    str_detect(`Genome Name`, "WREF plot", negate = TRUE)
  )

# Break the free-text genome name into site, sample and plot columns.
NEON_metagenomes <- NEON_metagenomes %>%
  # Strip the shared prefix "Terrestrial soil microbial communities from "
  mutate(
    `Genome Name` = str_replace(
      `Genome Name`, "Terrestrial soil microbial communities from ", ""
    )
  ) %>%
  # Split on " - " into the site description and the sample name
  separate(`Genome Name`, c("Site", "Sample Name"), " - ") %>%
  # Strip the shared suffix "-comp-1"
  mutate(`Sample Name` = str_replace(`Sample Name`, "-comp-1", "")) %>%
  # Split the sample name into site ID and plot info, keeping `Sample Name`
  separate(
    `Sample Name`, c("Site ID", "subplot.layer.date"), "_",
    remove = FALSE
  ) %>%
  # Split the plot info into its three "-"-separated parts
  separate(subplot.layer.date, c("Subplot", "Layer", "Date"), "-")
# Soil chemistry metadata; "-COMP" is stripped from genomicsSampleID so it can
# be joined against `Sample Name` below.
NEON_chemistry <- read_tsv("data/NEON/neon_plot_soilChem1_metadata.tsv") %>%
  mutate(genomicsSampleID = str_replace(genomicsSampleID, "-COMP", ""))

# Attach metagenome and soil chemistry metadata to every MAG. `Bin ID` becomes
# `label`, the column name the tree-annotation code further down joins on.
NEON_MAGs_metagenomes_chemistry <- NEON_MAGs %>%
  left_join(NEON_metagenomes, by = "Sample Name") %>%
  left_join(NEON_chemistry, by = c("Sample Name" = "genomicsSampleID")) %>%
  rename(label = `Bin ID`)
# GTDB-Tk decorated trees: ar53 (archaeal) and bac120 (bacterial) marker sets
tree_arc <- read.tree(file = "data/NEON/gtdbtk.ar53.decorated.tree")
tree_bac <- read.tree(file = "data/NEON/gtdbtk.bac120.decorated.tree")
# Gammaproteobacteria: every MAG whose Class matches
NEON_MAGs_metagenomes_chemistry_Gammaproteobacteria <-
  NEON_MAGs_metagenomes_chemistry %>%
  filter(str_detect(Class, "Gammaproteobacteria"))

# Steroidobacterales: every MAG whose Order matches
NEON_MAGs_metagenomes_chemistry_Steroidobacterales <-
  NEON_MAGs_metagenomes_chemistry %>%
  filter(str_detect(Order, "Steroidobacterales"))

# Burkholderiales: every MAG whose Order matches
NEON_MAGs_metagenomes_chemistry_Burkholderiales <-
  NEON_MAGs_metagenomes_chemistry %>%
  filter(str_detect(Order, "Burkholderiales"))

# Novel Gammaproteobacteria: at least one rank from Order to Species is missing
NEON_MAGs_metagenomes_chemistry_Gammaproteobacteria_Novel <-
  NEON_MAGs_metagenomes_chemistry_Gammaproteobacteria %>%
  filter(
    is.na(Order) | is.na(Family) | is.na(Genus) | is.na(Species)
  )
# Almost all are novel; only two have species names
# Toolik: MAGs from site ID "TOOL", with genome size in whole Kbp added
NEON_MAGs_metagenomes_chemistry_TOOL <- NEON_MAGs_metagenomes_chemistry %>%
  filter(`Site ID.x` == "TOOL") %>%
  mutate(
    `Genome Size (Kbp)` = as.integer(`Total Number of Bases` / 1000)
  )

# Novel Toolik MAGs: at least one rank from Order to Species is missing
NEON_MAGs_metagenomes_chemistry_TOOL_Novel <-
  NEON_MAGs_metagenomes_chemistry_TOOL %>%
  filter(
    is.na(Order) | is.na(Family) | is.na(Genus) | is.na(Species)
  )
# NOTE(review): the original note here ("almost all are novel, only two have
# species names") duplicates the Gammaproteobacteria note; confirm it holds
# for the Toolik subset.
# Labels (renamed Bin IDs) of every MAG in the Toolik subset
TOOL_MAGs_label <- NEON_MAGs_metagenomes_chemistry_TOOL$label
# Prune the bacterial tree down to the Toolik MAGs. The tips to drop are picked
# with %in% instead of a negated match(): `x[-match(...)]` errors as soon as one
# label is absent from the tree (NA inside a negative index) and drops nothing
# at all when no label matches. When every label is present the result is the
# same as before.
tree_bac_TOOL_MAGs <- drop.tip(
  tree_bac,
  tree_bac$tip.label[!(tree_bac$tip.label %in% TOOL_MAGs_label)]
)
# Vector of the pruned tree's tip labels followed by its internal node labels
node_vector_bac_TOOL_MAGS <- c(
  tree_bac_TOOL_MAGs$tip.label,
  tree_bac_TOOL_MAGs$node.label
)
# Copy of the Gammaproteobacteria table with space-free column names
NEON_MAGs_metagenomes_chemistry_Gamma_noblank <-
  NEON_MAGs_metagenomes_chemistry_Gammaproteobacteria %>%
  rename(
    Orders = Order,
    Phyla = Phylum,
    AssemblyType = `Assembly Type`,
    WaterpH = soilInWaterpH,
    Temp = soilTemp,
    BinCompleteness = `Bin Completeness`,
    BinContamination = `Bin Contamination`,
    TotalNumberofBases = `Total Number of Bases`,
    EcosystemSubtype = `Ecosystem Subtype`,
    GeneCount = `Gene Count`,
    GCassembled = `GC * assembled`
  )
# Copy of the Toolik table with space-free column names. Unlike the
# Gammaproteobacteria copy, `Order` is not renamed here.
NEON_MAGs_metagenomes_chemistry_TOOL_noblank <-
  NEON_MAGs_metagenomes_chemistry_TOOL %>%
  rename(
    Phyla = Phylum,
    AssemblyType = `Assembly Type`,
    WaterpH = soilInWaterpH,
    Temp = soilTemp,
    BinCompleteness = `Bin Completeness`,
    BinContamination = `Bin Contamination`,
    TotalNumberofBases = `Total Number of Bases`,
    EcosystemSubtype = `Ecosystem Subtype`,
    GeneCount = `Gene Count`,
    GCassembled = `GC * assembled`
  )
# Bacterial MAGs from individual (not combined) assemblies
NEON_MAGs_bact_ind <- NEON_MAGs %>%
  filter(Domain == "Bacteria") %>%
  filter(`Assembly Type` == "Individual")

# Novel here means at least one rank from Class to Genus is missing
NEON_MAGs_bact_ind_Novel <- NEON_MAGs_bact_ind %>%
  filter(
    is.na(Class) | is.na(Order) | is.na(Family) | is.na(Genus)
  )

# Toolik MAGs split by Gammaproteobacteria order
NEON_MAGs_metagenomes_chemistry_TOOL_Xanthomonadales <-
  NEON_MAGs_metagenomes_chemistry_TOOL %>%
  filter(str_detect(Order, "Xanthomonadales"))

NEON_MAGs_metagenomes_chemistry_TOOL_Burkholderiales <-
  NEON_MAGs_metagenomes_chemistry_TOOL %>%
  filter(str_detect(Order, "Burkholderiales"))

NEON_MAGs_metagenomes_chemistry_TOOL_Steroidobacterales <-
  NEON_MAGs_metagenomes_chemistry_TOOL %>%
  filter(str_detect(Order, "Steroidobacterales"))

# Every MAG whose site description mentions Alaska
NEON_MAGs_metagenomes_chemistry_Alaska <- NEON_MAGs_metagenomes_chemistry %>%
  filter(str_detect(Site.x, "Alaska"))
# Tip labels followed by internal node labels of the full bacterial tree
node_vector_bac <- c(tree_bac$tip.label, tree_bac$node.label)

# MAG count per phylum, most frequent first. Do not re-sort before the lookup
# below: the hard-coded row 16 correction depends on this order.
# NEON
phylumss <- NEON_MAGs_metagenomes_chemistry %>%
  count(Phylum, sort = TRUE)

# For rows 1..28, store in column 3 the position in node_vector_bac of the
# label matching the phylum name: the second hit when there are exactly two
# hits, otherwise the first.
# NOTE(review): the bound 29 is hard-coded; confirm it equals
# nrow(phylumss) + 1, otherwise the last phylum is skipped.
n <- 1
while (n != 29) {
  phylumss[n, 3] <- match(
    grep(phylumss[n, 1], node_vector_bac, value = TRUE),
    node_vector_bac
  )[if (length(grep(phylumss[n, 1], node_vector_bac, value = TRUE)) == 2) 2 else 1]
  n <- n + 1
}

# Phylum sub-groups are not named the same way for every phylum, so the row
# for Desulfobacterota is corrected by hand to use its first hit.
# grep("Desulfobacterota", node_vector_bac, value = TRUE)
# match(grep("Desulfobacterota", node_vector_bac, value = TRUE), node_vector_bac)
# match(grep("Desulfobacterota_B", node_vector_bac, value = TRUE), node_vector_bac)
# NOTE(review): row 16 is assumed to be Desulfobacterota; verify against the
# current phylum counts.
phylumss[16, 3] <- match(
  grep(phylumss[16, 1], node_vector_bac, value = TRUE),
  node_vector_bac
)[1]

# Order phyla by descending node position (`...3` is the auto-named column 3)
phylumss <- phylumss %>%
  arrange(desc(`...3`))

# One viridis colour per row, stored in column 4 (rows 1..28 only)
colortest <- viridis(29)
n <- 1
while (n != 29) {
  phylumss[n, 4] <- colortest[n]
  n <- n + 1
}
# Gammaproteobacteria: sort the bacterial tree into preorder, then extract the
# clade rooted at node 3048.
# NOTE(review): 3048 is a hard-coded node number; it presumably is the
# Gammaproteobacteria class node, so re-check it if the tree file changes.
tree_bac_node_Gammaproteobacteria <- Preorder(tree_bac)
tree_Gammaproteobacteria <- Subtree(
  tree_bac_node_Gammaproteobacteria,
  node = 3048
)
# grep("Thermoproteota", node_vector_bac, value = TRUE)
# match(grep("Thermoproteota", node_vector_bac, value = TRUE), node_vector_bac)
#
# grep("Actinomycetota", node_vector_bac, value = TRUE)
# match(grep("Actinomycetota", node_vector_bac, value = TRUE), node_vector_bac)
#
# grep("Desulfobacterota", node_vector_bac, value = TRUE)
# match(grep("Desulfobacterota", node_vector_bac, value = TRUE), node_vector_bac)
# match(grep("Desulfobacterota_B", node_vector_bac, value = TRUE), node_vector_bac)
#
# grep("Bacteroidota", node_vector_bac, value = TRUE)
# match(grep("Bacteroidota", node_vector_bac, value = TRUE), node_vector_bac)
#
# grep("Verrucomicrobiota", node_vector_bac, value = TRUE)
# match(grep("Verrucomicrobiota", node_vector_bac, value = TRUE), node_vector_bac)
#
# grep("Chloroflexota", node_vector_bac, value = TRUE)
# match(grep("Chloroflexota", node_vector_bac, value = TRUE), node_vector_bac)
#
# grep("Eremiobacterota", node_vector_bac, value = TRUE)
# match(grep("Eremiobacterota", node_vector_bac, value = TRUE), node_vector_bac)
#
# grep("Patescibacteria", node_vector_bac, value = TRUE)
# match(grep("Patescibacteria", node_vector_bac, value = TRUE), node_vector_bac)
#
# grep("Pseudomonadota", node_vector_bac, value = TRUE)
# match(grep("Pseudomonadota", node_vector_bac, value = TRUE), node_vector_bac)
# grep("Phycisphaerae", node_vector_bac_TOOL_MAGS, value = TRUE)
# match(grep("Phycisphaerae", node_vector_bac_TOOL_MAGS, value = TRUE), node_vector_bac_TOOL_MAGS)
#
# grep("Acidobacteriota", node_vector_bac_TOOL_MAGS, value = TRUE)
# match(grep("Acidobacteriota", node_vector_bac_TOOL_MAGS, value = TRUE), node_vector_bac_TOOL_MAGS)
#
# grep("Actinomycetota", node_vector_bac_TOOL_MAGS, value = TRUE)
# match(grep("Actinomycetota", node_vector_bac_TOOL_MAGS, value = TRUE), node_vector_bac_TOOL_MAGS)
#
# grep("Myxococcota", node_vector_bac_TOOL_MAGS, value = TRUE)
# match(grep("Myxococcota", node_vector_bac_TOOL_MAGS, value = TRUE), node_vector_bac_TOOL_MAGS)
#
# grep("Bacteroidota", node_vector_bac_TOOL_MAGS, value = TRUE)
# match(grep("Bacteroidota", node_vector_bac_TOOL_MAGS, value = TRUE), node_vector_bac_TOOL_MAGS)
#
# grep("Verrucomicrobiota", node_vector_bac_TOOL_MAGS, value = TRUE)
# match(grep("Verrucomicrobiota", node_vector_bac_TOOL_MAGS, value = TRUE), node_vector_bac_TOOL_MAGS)
#
# grep("Chloroflexota", node_vector_bac_TOOL_MAGS, value = TRUE)
# match(grep("Chloroflexota", node_vector_bac_TOOL_MAGS, value = TRUE), node_vector_bac_TOOL_MAGS)
#
# grep("Eremiobacterota", node_vector_bac_TOOL_MAGS, value = TRUE)
# match(grep("Eremiobacterota", node_vector_bac_TOOL_MAGS, value = TRUE), node_vector_bac_TOOL_MAGS)
#
# grep("Patescibacteria", node_vector_bac_TOOL_MAGS, value = TRUE)
# match(grep("Patescibacteria", node_vector_bac_TOOL_MAGS, value = TRUE), node_vector_bac_TOOL_MAGS)
#
# grep("Patescibacteria", node_vector_bac_TOOL_MAGS, value = TRUE)
# match(grep("Patescibacteria", node_vector_bac_TOOL_MAGS, value = TRUE), node_vector_bac_TOOL_MAGS)
# grep("Gammaproteobacteria", node_vector_bac_TOOL_MAGS, value = TRUE)
# match(grep("Gammaproteobacteria", node_vector_bac_TOOL_MAGS, value = TRUE), node_vector_bac_TOOL_MAGS)
# Label in the Toolik tree that mentions Gammaproteobacteria
grep(
  pattern = "Gammaproteobacteria",
  x = node_vector_bac_TOOL_MAGS,
  value = TRUE
)
## [1] "'1.0:c__Gammaproteobacteria'"
# Position of that label within the tip + node label vector
match(
  x = grep("Gammaproteobacteria", node_vector_bac_TOOL_MAGS, value = TRUE),
  table = node_vector_bac_TOOL_MAGS
)
## [1] 259
# Split the MAG table by assembly type
NEON_MAGs_ind <- filter(NEON_MAGs, `Assembly Type` == "Individual")
NEON_MAGs_co <- filter(NEON_MAGs, `Assembly Type` == "Combined")
# Split the GTDB-Tk lineage of the combined-assembly MAGs into one column per
# taxonomic rank
sankey_data <- NEON_MAGs_co %>%
  select(`GTDB-Tk Taxonomy Lineage`) %>%
  # MAGs without a lineage are assumed to be Archaea
  replace_na(list(`GTDB-Tk Taxonomy Lineage` = "Archaea")) %>%
  separate(
    `GTDB-Tk Taxonomy Lineage`,
    c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"),
    "; "
  )
## Warning: Expected 7 pieces. Missing pieces filled with `NA` in 624 rows [1, 2, 3, 4, 5,
## 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
# Fill each missing rank with the nearest known rank to its left
sankey_data[] <- t(apply(sankey_data, 1, zoo::na.locf))
# Build the Pavian classification string: prefix the domain with d__, then
# turn the six "; " separators, left to right, into |p__ ... |s__
sankey_data <- sankey_data %>%
  unite(
    col = "classification",
    c(Domain, Phylum, Class, Order, Family, Genus, Species),
    sep = "; "
  ) %>%
  mutate(
    classification = str_replace(classification, "Archaea", "d__Archaea"),
    classification = str_replace(classification, "Bacteria", "d__Bacteria"),
    classification = str_replace(classification, "; ", "|p__"),
    classification = str_replace(classification, "; ", "|c__"),
    classification = str_replace(classification, "; ", "|o__"),
    classification = str_replace(classification, "; ", "|f__"),
    classification = str_replace(classification, "; ", "|g__"),
    classification = str_replace(classification, "; ", "|s__")
  )
# One copy of the classification table per rank, each truncated just before
# the next lower rank's prefix, so every rank can be counted on its own
sankey_data_s <- sankey_data
sankey_data_g <- sankey_data %>%
  mutate(classification = sub("\\|s__.*", "", classification))
sankey_data_f <- sankey_data %>%
  mutate(classification = sub("\\|g__.*", "", classification))
sankey_data_o <- sankey_data %>%
  mutate(classification = sub("\\|f__.*", "", classification))
sankey_data_c <- sankey_data %>%
  mutate(classification = sub("\\|o__.*", "", classification))
sankey_data_p <- sankey_data %>%
  mutate(classification = sub("\\|c__.*", "", classification))
sankey_data_d <- sankey_data %>%
  mutate(classification = sub("\\|p__.*", "", classification))

# Stack all ranks, count each distinct classification string, and use the
# column names Pavian expects
sankey_data_allTaxa <- bind_rows(
  sankey_data_s, sankey_data_g, sankey_data_f, sankey_data_o,
  sankey_data_c, sankey_data_p, sankey_data_d
) %>%
  mutate(classification = as.factor(classification)) %>%
  count(classification) %>%
  rename(
    `#SampleID` = classification,
    Metaphlan2_Analysis = n
  )

# Write the input file for the Pavian Sankey app
write_tsv(sankey_data_allTaxa, "data/NEON/NEON_MAG_co_pavian.txt")
# theme_classic(axis.text.x = element_text(color="grey20", size = 12,angle = 90, hjust = 0.5, vjust = 0.5),
# axis.text.y = element_text(color = "grey20", size = 12), text=element_text(size = 16))
# Shared axis styling: italic black x labels rotated 45 degrees, black y
# labels, and size-15 axis titles
John_theme <- theme(
  axis.text.x = element_text(
    angle = 45, vjust = 1, hjust = 1, color = "black", face = "italic"
  ),
  axis.title.x = element_text(size = 15),
  axis.text.y = element_text(color = "black"),
  axis.title.y = element_text(size = 15)
)
Carbon emissions from industrial activity have led to numerous changes to the global climate that threaten the ecosystems humanity depends on for industrial agriculture. Rising temperatures have caused the melting of glaciers and permafrost, releasing bacterial species that have been dormant for millennia (source). Additionally, the higher greenhouse gas content of the atmosphere has led to the acidification of both ocean and ground water (source). The changing climate has also led to species migration. With the recent passing of the 1.5 °C average global temperature milestone set by ORG, it is imperative that society adapt to our changing world.
In the face of climate and antibiotic challenges, plants have developed symbiotic relationships with bacteria. PLANT BACTERIA BIOCONTROL EX. PLANT NITROGEN RELATION. Thus, it has been proposed that humanity’s crops could be better insulated from ecological changes by exploiting these relationships. While several beneficial bacterial species have been identified, the vast majority of the bacterial kingdom remains unsequenced. Additionally, with their ability to rapidly evolve in the face of ecological challenges, the number of new species with more robust tolerances to climate change influences will only grow with time. Thus, soil bacteria represent a vast untapped resource of climate-change-resistant proteins, biocontrol agents, and nitrogen fixers. Data collection efforts by organizations like the National Ecological Observatory Network provide a valuable genomic resource for phylogenetic analyses to determine the identities of potentially beneficial bacteria, as well as for monitoring the population changes caused by a changing climate.
# Circular cladogram of the preorder-sorted bacterial tree, with one clade
# label and highlight per phylum that has a node position in phylumss.
# NOTE(review): the node positions in phylumss were looked up against
# tree_bac; confirm node numbering is unchanged in the Preorder()-sorted tree
# plotted here.
plottttttt <- ggtree(
  tree_bac_node_Gammaproteobacteria,
  layout = "circular",
  branch.length = "none"
)
# NOTE(review): the bound 29 is hard-coded, so rows 1..28 of phylumss are drawn.
n <- 1
while (n != 29) {
  # Skip phyla for which no node was found
  if (!is.na(phylumss[n, 3])) {
    plottttttt <- plottttttt +
      geom_cladelab(
        node = as.integer(phylumss[n, 3]),
        label = as.character(phylumss[n, 1]),
        size = 10,
        align = TRUE,
        angle = "auto",
        offset.text = 1,
        textcolor = as.character(phylumss[n, 4]),
        barsize = 1.5,
        fontsize = 5,
        barcolor = as.character(phylumss[n, 4])
      ) +
      # alpha belongs to geom_hilight(); it used to sit inside as.character(),
      # where it was silently ignored
      geom_hilight(
        node = as.integer(phylumss[n, 3]),
        fill = as.character(phylumss[n, 4]),
        alpha = .6
      )
  }
  n <- n + 1
}
plottttttt
Figure 1: Phylogenetic Tree of all bacterial MAGs
collected in GOLD Study ID Gs0161344 by the National Ecological
Observatory Network with phylum labels.
# Embed the pre-rendered Pavian Sankey HTML for the individual-assembly MAGs
knitr::include_url(url = "data/lab14/sankey-NEON_MAG_ind_pavian.txt.html")
Figure 2: Sankey plot of all NEON Individual assembly MAGs.
This study’s genomic data set was collected from soil samples by the National Ecological Observatory Network (NEON) from locations across the United States in GOLD Study ID Gs0161344. There were 1754 total MAGs, with PERCENT being novel species of bacteria. To make analyses more feasible, this report will only comment on two data subsets: MAGs belonging to the class Gammaproteobacteria, and MAGs found at Toolik Field Station, Alaska, USA.
The class Gammaproteobacteria, under the phylum Pseudomonadota, is made up of around 381 genera that thrive in marine, terrestrial, and eukaryotic host ecosystems (Liao et al. 2020). Historically, this class has been defined phylogenetically by 16S rRNA sequence homology (Williams and Kelly 2013). Some notable members of this class include Escherichia coli, Vibrio fischeri, and Pseudomonas aeruginosa. INSERT SOIL EXAMPLES. This class has a great diversity of morphologies, with rods, cocci, spirilla, and filaments all represented (Williams et al. 2010). Additionally, species in this class display a variety of trophisms, including chemoautotrophs and photoautotrophs (Gao, Mohan, and Gupta 2009).
Located 400 miles north from Fairbanks, Alaska at the foot of the Brooks mountain range, biodiversity at Toolik Field Station is heavily influenced by its harsh winters where temperatures can reach -50⁰F. It is home to a variety of fauna including caribou, loons, voles, and polar bears. Located above the northern tree line, the vegetation in the tundra here mainly consists of birch, willow, sedges and grass. The site contains a large range of soil conditions, including layers of permafrost, created by glacial action (NEON 2023).
This study examines the genomic content and environmental conditions of bacteria found at the Toolik Field station to help establish a reference population for future comparisons of bacterial population changes.
Microbial samples analyzed in this study were collected from soil samples taken from NEON observation sites across the United States and sequenced via high-throughput Illumina sequencing. Sequence results were then processed and annotated by the DOE JGI Metagenome Workflow for inclusion in the Integrated Microbial Genomes and Microbiomes (IMG/M) Database and the Joint Genome Institute’s Genomes Online Database (JGI GOLD). Briefly, this workflow consists of the following steps: (1) Assembly of contigs and read alignment to assembled contigs. Contigs are additionally processed for quality control. (2) Feature prediction of coding and non-coding genes, as well as CRISPR sequences. (3) Functional annotation, in which predicted features are assigned identifiers based on sequence similarity. (4) Taxonomic annotation, in which contig-level phylogenetic assignments are made based on functional annotations. (5) Binning into high- and medium-quality genome bins. Bins are additionally screened for contamination. A detailed explanation of the workflow can be found in Clum et al., ASM mSystems, 2021.
The figures of this study were formatted with the following packages in R: tidyverse, knitr, ggtree, TDbook (a companion package for the book “Data Integration, Manipulation and Visualization of Phylogenetic Trees” by Guangchuang Yu, 2022, ISBN: 9781032233574), ggimage, rphylopic, treeio, tidytree, ape, TreeTools, phytools, ggnewscale, ggtreeExtra, ggstar, DT (GGTREE SOURCES)
# Circular cladogram of the full bacterial tree. The first phylum row of
# phylumss is highlighted in its assigned colour, and the clade at node 3048 is
# highlighted and labelled as Gammaproteobacteria in steel blue.
ggtree(tree_bac, layout = "circular", branch.length = "none") +
  # alpha belongs to geom_hilight(); it used to sit inside as.character(),
  # where it was silently ignored
  geom_hilight(
    node = as.integer(phylumss[1, 3]),
    fill = as.character(phylumss[1, 4]),
    alpha = .6
  ) +
  geom_cladelab(
    node = as.integer(phylumss[1, 3]),
    label = as.character(phylumss[1, 1]),
    size = 10,
    align = TRUE,
    angle = "auto",
    offset.text = 1,
    textcolor = as.character(phylumss[1, 4]),
    barsize = 1.5,
    fontsize = 5,
    barcolor = as.character(phylumss[1, 4])
  ) +
  geom_hilight(node = 3048, fill = "steelblue", alpha = .6) +
  geom_cladelab(
    node = 3048,
    label = "Gammaproteobacteria",
    align = TRUE,
    angle = "auto",
    offset = 1,
    offset.text = 0.5,
    textcolor = "steelblue",
    barcolor = "steelblue",
    barsize = 1.5,
    fontsize = 5
  )
Figure 5: Phylogenetic tree of all bacterial MAGs
collected in GOLD Study ID Gs0161344 by the National Ecological
Observatory Network with the Gammaproteobacteria class of
Pseudomonadota highlighted in blue.
# Dodged horizontal bars: MAGs per phylum (most frequent at the top), one bar
# per site
NEON_MAGs_bact_ind %>%
  ggplot(aes(x = fct_rev(fct_infreq(Phylum)), fill = `Site ID`)) +
  geom_bar(position = "dodge") +
  coord_flip() +
  labs(x = "Phylum", y = "NEON MAGs (n)") +
  theme_classic() +
  theme(
    axis.text.x = element_text(color = "black"),
    axis.title.x = element_text(size = 15),
    axis.text.y = element_text(color = "black", face = "italic"),
    axis.title.y = element_text(size = 15),
    legend.title = element_text(size = 15),
    legend.text = element_text(size = 10)
  ) +
  scale_y_continuous(limits = c(0, 100), breaks = seq(0, 100, by = 10))
Figure 6: NEON MAG Distribution by Phylum. NEON site
distribution of MAGs collected in GOLD Study ID Gs0161344 organized by
phylum.
# Horizontal bars: novel bacterial MAGs per site, most frequent at the top
NEON_MAGs_bact_ind_Novel %>%
  ggplot(aes(x = fct_rev(fct_infreq(`Site ID`)), fill = `Site ID`)) +
  geom_bar(show.legend = FALSE) +
  coord_flip() +
  labs(x = "Site ID", y = "Total Novel Bacteria (n)") +
  theme_classic() +
  theme(
    axis.text.x = element_text(color = "black"),
    axis.title.x = element_text(size = 15),
    axis.text.y = element_text(color = "black")
  )
Figure 7: Novel Bacteria MAG NEON Site Distribution.
Novel Bacteria were determined from MAGs constructed from individual
assemblies. Novel indicates the MAGs could not be placed in an existing
group at the species, genus or family level.
The vast majority of bacteria found in this study were determined to be novel species, with PERCENT unable to be placed phylogenetically at the genus level.
### Figure 8:
# Horizontal bars: MAGs missing any rank from Class to Genus, per phylum
NEON_MAGs_bact_ind %>%
  filter(
    is.na(Class) | is.na(Order) | is.na(Family) | is.na(Genus)
  ) %>%
  ggplot(aes(x = fct_rev(Phylum), fill = Phylum)) +
  geom_bar(show.legend = FALSE) +
  coord_flip() +
  labs(x = "Phylum", y = "Novel Genera (n)") +
  theme_classic() +
  theme(
    axis.text.x = element_text(color = "black"),
    axis.title.x = element_text(size = 15),
    axis.text.y = element_text(color = "black", face = "italic")
  ) +
  scale_y_continuous(limits = c(0, 150), breaks = seq(0, 150, by = 10))
Figure 8: Novel Bacteria Distribution by Phyla. Novel
bacteria were determined from MAGs constructed from individual
assemblies in GOLD Study ID Gs0161344. Novel indicates the MAGs could
not be placed in an existing group at the species, genus or family
level.
# Bacterial MAGs with genome size in whole Kbp
NEON_MAGs_metagenomes_chemistry_Bacteria <- NEON_MAGs_metagenomes_chemistry %>%
  filter(Domain == "Bacteria") %>%
  mutate(
    `Genome Size (Kbp)` = as.integer(`Total Number of Bases` / 1000)
  )

# Two plots combined with `+` (presumably patchwork — confirm it is loaded).
# Panel A: genome size per phylum
NEON_MAGs_metagenomes_chemistry_Bacteria %>%
  ggplot(aes(x = Phylum, y = `Genome Size (Kbp)`, color = Phylum)) +
  geom_boxplot(show.legend = FALSE) +
  scale_y_continuous(limits = c(0, 15000), breaks = seq(0, 15000, by = 2500)) +
  theme_classic() +
  labs(title = "A", x = "Phylum", y = "Genome Size (Kbp)") +
  John_theme +
  theme(title = element_text(size = 20)) +
  # Panel B: gene count against genome size, coloured by phylum
  ggplot(
    data = NEON_MAGs_metagenomes_chemistry_Bacteria,
    aes(x = `Gene Count`, y = `Genome Size (Kbp)`, color = Phylum)
  ) +
  geom_point() +
  labs(title = "B", x = "Gene Count (n)", y = "") +
  scale_y_continuous(limits = c(0, 15000), breaks = seq(0, 15000, by = 2500)) +
  scale_x_continuous(limits = c(0, 15000), breaks = seq(0, 15000, by = 2500)) +
  theme_classic() +
  theme(
    axis.text.x = element_text(color = "black"),
    axis.title.x = element_text(size = 15),
    axis.text.y = element_text(color = "black"),
    legend.title = element_text(size = 15),
    legend.text = element_text(size = 10),
    title = element_text(size = 20)
  )
Figure 9: A) Boxplot of MAG genome size (kbp) for all
bacterial phyla found in GOLD Study ID Gs0161344. B) Scatter plot of MAG
Gene count vs genome size (kbp) for all bacterial MAGs in GOLD Study ID
Gs0161344. MAGs are colored by phylum.
Terrestrial bacteria are known to have large genomes encoding thousands of genes. This is due in large part to the diverse environments they are exposed to. Their larger genomes allow for the expression of multiple metabolic phenotypes that allow them to adapt to environmental challenges. NEON samples analyzed in this study had a broad spread of genome sizes, with the minimum and maximum genome sizes being 753 kbp from the phylum Chloroflexota and 12,584 kbp from the phylum Actinomycetota, respectively (Fig. 9). There was a linear relationship between gene count and genome size for all NEON samples, with a ratio of roughly 1,000 bp per gene (Fig. 9).
# Embed the pre-rendered Pavian Sankey HTML for the Gammaproteobacteria MAGs
knitr::include_url(url = "data/lab14/sankey-NEON_MAG_ind_Gpro.txt.html")
Figure 10: Sankey plot of individual assembly Gammaproteobacteria MAGs in GOLD Study ID Gs0161344.
# Circular Gammaproteobacteria subtree with the MAG metadata attached via %<+%.
# Points are coloured by ecosystem subtype and sized by total bases.
ggtree(tree_Gammaproteobacteria, layout = "circular") %<+%
  NEON_MAGs_metagenomes_chemistry_Gammaproteobacteria +
  geom_point2(
    mapping = aes(color = `Ecosystem Subtype`, size = `Total Number of Bases`)
  ) +
  theme(
    legend.title = element_text(size = 15),
    legend.text = element_text(size = 10)
  )
## Warning: Removed 123 rows containing missing values or values outside the scale range
## (`geom_point_g_gtree()`).
Figure 11: Phylogenetic Tree of
Gammaproteobacteria with Ecosystem Subtype and Number of Bases
markers. This tree includes MAG in GOLD Study ID Gs0161344 filtered to
those annotated as belonging to the class
Gammaproteobacteria.
# Horizontal bars of MAG counts per family, one facet per order
NEON_MAGs_metagenomes_chemistry_Gammaproteobacteria %>%
  ggplot(aes(x = Family)) +
  geom_bar(
    aes(fill = Family),
    position = position_dodge2(width = 0.9, preserve = "single"),
    show.legend = FALSE
  ) +
  coord_flip() +
  facet_wrap(vars(Order), scales = "free_y", ncol = 4) +
  labs(x = "Family", y = "MAGs (n)") +
  theme_classic() +
  theme(
    axis.text.x = element_text(color = "black"),
    axis.title.x = element_text(size = 15),
    axis.text.y = element_text(color = "black", face = "italic"),
    legend.title = element_text(size = 15),
    legend.text = element_text(size = 10),
    strip.text = element_text(size = 15, face = "italic")
  ) +
  scale_y_continuous(limits = c(0, 75), breaks = seq(0, 60, by = 15))
Figure 12: Distribution of
Gammaproteobacteria MAGs by Order. This includes MAG
reads from all NEON sites in GOLD Study ID Gs0161344 filtered to those
annotated as belonging to the class Gammaproteobacteria.
# Boxplot of genome size (whole Kbp) per Gammaproteobacteria order
NEON_MAGs_metagenomes_chemistry_Gammaproteobacteria %>%
  mutate(
    `Genome Size (Kbp)` = as.integer(`Total Number of Bases` / 1000)
  ) %>%
  ggplot(aes(x = Order, y = `Genome Size (Kbp)`, color = Order)) +
  geom_boxplot(show.legend = FALSE) +
  scale_y_continuous(
    limits = c(0, 15000),
    breaks = seq(2500, 15000, by = 2500)
  ) +
  theme_classic() +
  labs(x = "Order", y = "Genome Size (Kbp)") +
  John_theme
Figure 13: Distribution of Genome Size (kbp)
for Gammaproteobacteria Members. Boxplots include MAG
reads from all NEON sites filtered to those annotated as belonging to
the class Gammaproteobacteria.
NEON MAGs assigned to the Gammaproteobacteria class were
found in all ecosystem subtypes (Fig. 10). Unlike the larger NEON data
set, the distribution of genome sizes of Gammaproteobacteria
members was fairly narrow, with members averaging between 2000 and 5000
kbp (Fig. 12). With the exception of
Burkholderiales, Steroidobacterales, and
Xanthomonadales, this was largely due to the fewer MAGs in each
order (Fig. 10). The vast majority of Gammaproteobacteria
annotated in this study were novel species, with only two bacteria
belonging to the Xanthomonadales order assigned as
Stenotrophomonas stenotrophomonas sp024519465.
Steroidobacterales had by far the most annotated members,
while Burkholderiales had the most family member groups (Fig.
11).
# Four plots combined with `+` (presumably patchwork — confirm it is loaded),
# each showing Gammaproteobacteria families coloured by order.
# Panel A: soil temperature.
# NOTE(review): `size = 1` is passed to ggplot(), not geom_point(); it looks
# like it has no effect on point size — confirm the intent.
NEON_MAGs_metagenomes_chemistry_Gammaproteobacteria %>%
  ggplot(aes(x = Family, y = soilTemp, color = Order), size = 1) +
  geom_point(show.legend = FALSE) +
  scale_y_continuous(limits = c(0, 50)) +
  labs(title = "A", y = "Soil Temperature", x = "") +
  theme_classic() +
  John_theme +
  theme(title = element_text(size = 20)) +
  # Panel B: soil pH in water
  ggplot(
    data = NEON_MAGs_metagenomes_chemistry_Gammaproteobacteria,
    aes(x = Family, y = soilInWaterpH, color = Order)
  ) +
  geom_point() +
  scale_y_continuous(limits = c(0, 14), breaks = 0:14) +
  labs(title = "B", y = "pH", x = "") +
  theme_classic() +
  theme(
    title = element_text(size = 20),
    legend.text = element_text(size = 10, face = "italic"),
    legend.title = element_text(size = 15)
  ) +
  John_theme +
  # Panel C: NLCD vegetation class
  ggplot(
    data = NEON_MAGs_metagenomes_chemistry_Gammaproteobacteria,
    aes(x = Family, y = nlcdClass, color = Order)
  ) +
  geom_point(show.legend = FALSE) +
  labs(title = "C", y = "Vegetation Class", x = "Family") +
  theme_classic() +
  John_theme +
  theme(title = element_text(size = 20)) +
  # Panel D: ecosystem subtype
  ggplot(
    data = NEON_MAGs_metagenomes_chemistry_Gammaproteobacteria,
    aes(x = Family, y = `Ecosystem Subtype`, color = Order)
  ) +
  geom_point() +
  labs(title = "D", y = "Ecosystem Subtype", x = "Family") +
  theme_classic() +
  theme(
    title = element_text(size = 20),
    legend.title = element_text(size = 15),
    legend.text = element_text(size = 10)
  ) +
  John_theme
## Warning: Removed 39 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 39 rows containing missing values or values outside the scale range
## (`geom_point()`).
Figure 14: Ecological Conditions of
Gammaproteobacteria Samples. A) Scatterplot of sample soil
temperature vs Gammaproteobacteria families found in GOLD Study
ID Gs0161344. Points are colored by order group. B) Scatterplot of
sample soil pH in water vs Gammaproteobacteria families found
in GOLD Study ID Gs0161344. Points are colored by order group. C)
Scatterplot of sample National Land Cover Database Vegetation Type vs
Gammaproteobacteria families found in GOLD Study ID Gs0161344.
Points are colored by order group. D) Scatterplot of sample ecosystem
subtype vs Gammaproteobacteria families found in GOLD Study ID
Gs0161344. Points are colored by order group.
Members of Gammaproteobacteria were found in a variety of ecosystem conditions. The soil pH of all Gammaproteobacteria samples was largely neutral to mildly acidic, with the lowest pH around 4 (Fig. 4). Overall, the sedge herbaceous and grassland herbaceous vegetation classes contained the fewest family groups of Gammaproteobacteria (Fig. 13). This is not too surprising for sedge herbaceous, as this vegetation class is only found in one Alaskan NEON site (Sup. Fig. 1). However, grassland herbaceous vegetation is present in 5 out of the 13 sample sites (Sup. Fig. 2). Interestingly, despite being the Gammaproteobacteria order with the most MAGs and existing in a variety of vegetation classes, soil pH values, and temperatures, no members of the Steroidobacterales order were found in the tropical forest or desert ecosystems sampled in this study (Fig. 13). These ecosystem subtypes correspond to NEON sites in Puerto Rico and Arizona, USA, respectively (Sup. Fig. 2).
# Add genome size in whole Kbp to the Steroidobacterales table
NEON_MAGs_metagenomes_chemistry_Steroidobacterales <-
  NEON_MAGs_metagenomes_chemistry_Steroidobacterales %>%
  mutate(
    `Genome Size (Kbp)` = as.integer(`Total Number of Bases` / 1000)
  )

# Four plots combined with `+` (presumably patchwork — confirm it is loaded),
# one per genus-level view.
# Panel A: ecosystem subtype
NEON_MAGs_metagenomes_chemistry_Steroidobacterales %>%
  ggplot(aes(x = Genus, y = `Ecosystem Subtype`, color = Genus)) +
  geom_point(show.legend = FALSE) +
  labs(title = "A", x = "", y = "Ecosystem Subtype") +
  theme_classic() +
  John_theme +
  theme(title = element_text(size = 20)) +
  # Panel B: site ID
  ggplot(
    data = NEON_MAGs_metagenomes_chemistry_Steroidobacterales,
    aes(x = Genus, y = `Site ID.x`, color = Genus)
  ) +
  geom_point(show.legend = FALSE) +
  labs(title = "B", x = "", y = "Site ID") +
  theme_classic() +
  John_theme +
  theme(title = element_text(size = 20)) +
  # Panel C: genome size
  ggplot(
    data = NEON_MAGs_metagenomes_chemistry_Steroidobacterales,
    aes(x = Genus, y = `Genome Size (Kbp)`, color = Genus)
  ) +
  geom_boxplot(show.legend = FALSE) +
  scale_y_continuous(
    limits = c(0, 15000),
    breaks = seq(2500, 15000, by = 2500)
  ) +
  theme_classic() +
  labs(title = "C", x = "Genus", y = "Genome Size (Kbp)") +
  John_theme +
  theme(title = element_text(size = 20)) +
  # Panel D: MAG count per genus
  ggplot(
    data = NEON_MAGs_metagenomes_chemistry_Steroidobacterales,
    aes(x = Genus, fill = Genus)
  ) +
  geom_bar(show.legend = FALSE) +
  labs(title = "D", x = "Genus", y = "MAGs (n)") +
  theme_classic() +
  scale_y_continuous(limits = c(0, 50), breaks = seq(0, 50, by = 10)) +
  John_theme +
  theme(title = element_text(size = 20))
Figure 15: A) Dotplot of the ecosystem subtypes in which each genus
of Steroidobacteraceae was found. B) Dotplot of the NEON
sites at which each genus of Steroidobacteraceae was found. C)
Boxplot of MAG genome sizes in Kbp for each genus of
Steroidobacteraceae. D) Total MAG counts of each genus of
Steroidobacteraceae. All samples belong to the GOLD Study ID
Gs0161344.
# Keep only the Steroidobacterales rows whose Genus contains "Bog-1198".
NEON_MAGs_metagenomes_chemistry_Bog <- filter(
  NEON_MAGs_metagenomes_chemistry_Steroidobacterales,
  str_detect(Genus, "Bog-1198")
)
# Bog-1198 MAG counts per ecosystem subtype (panel A) and per site ID
# (panel B), composed side by side with `+`.
#
# The y scale pins the lower limit at 0 but leaves the upper limit open (NA).
# A fixed limits = c(0, 10) makes ggplot2 discard every bar taller than 10
# ("Removed 1 row ... outside the scale range (`geom_bar()`)"), which hid the
# largest group in each panel. Breaks are whole numbers because y is a count.
NEON_MAGs_metagenomes_chemistry_Bog %>%
  ggplot(aes(x = `Ecosystem Subtype`)) +
  geom_bar(show.legend = FALSE, fill = "black") +
  labs(title = "A", x = "Ecosystem Subtype", y = " MAGs (n)") +
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1, color = "black"),
    axis.title.x = element_text(size = 15),
    axis.text.y = element_text(color = "black"),
    axis.title.y = element_text(size = 15),
    title = element_text(size = 20)
  ) +
  scale_y_continuous(
    limits = c(0, NA),
    breaks = function(lims) unique(floor(pretty(lims)))
  ) +
  ggplot(data = NEON_MAGs_metagenomes_chemistry_Bog, aes(x = `Site ID.x`)) +
  geom_bar(show.legend = FALSE, fill = "black") +
  labs(title = "B", x = "Site ID", y = " MAGs (n)") +
  theme_classic() +
  theme(
    axis.text.x = element_text(color = "black"),
    axis.title.x = element_text(size = 15),
    axis.text.y = element_text(color = "black"),
    axis.title.y = element_text(size = 15),
    title = element_text(size = 20)
  ) +
  scale_y_continuous(
    limits = c(0, NA),
    breaks = function(lims) unique(floor(pretty(lims)))
  )
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_bar()`).
## Removed 1 row containing missing values or values outside the scale range
## (`geom_bar()`).
Figure 16: Steroidobacteraceae
BOG-1198 Ecosystem subtype and site
distribution. A) Barchart of BOG-1198 MAGs found in
each ecosystem subtype. B) Barchart of BOG-1198 MAGs found at
each Site. All samples belong to the GOLD Study ID Gs0161344.
Members of the order Steroidobacterales were further examined to see if their high population is correlated with genome size or sample location. Only the Steroidobacteraceae family was found under the order. This family contains 7 genera, with the genus BOG-1198 accounting for 30 of the 50 Steroidobacterales MAGs (Figs. 10, 14). Once again, the distribution of genome size appears correlated with the total MAG count for each genus in the Steroidobacteraceae family, with the higher populations corresponding to larger deviations in genome size. Given that BOG-1198 accounted for a majority of Steroidobacterales MAGs, the genus was further examined. Members of this genus were found in several ecosystem subtypes and sample sites located in the northern United States. The majority of individually assembled Steroidobacteraceae BOG-1198 MAGs were found at one of the three Alaskan sample sites (Figure 16). Note that all combined assembly MAGs were given the shrubland ecosystem subtype. A third of all BOG-1198 MAGs were coassembled.
# Add a genome-size column in kilobase pairs: total bases divided by 1000,
# truncated to an integer by as.integer().
NEON_MAGs_metagenomes_chemistry_Burkholderiales <- mutate(
  NEON_MAGs_metagenomes_chemistry_Burkholderiales,
  `Genome Size (Kbp)` = as.integer(`Total Number of Bases` / 1000)
)
# Four-panel overview of the Burkholderiales MAGs by genus, coloured/filled by
# Family. Panels are combined with `+` (ggplot + ggplot composition,
# presumably provided by patchwork -- confirm it is attached).
# Only panels B and D draw a legend; A and C suppress it.

# Panel A: ecosystem subtype per genus
burkholderiales_panel_a <- ggplot(
  NEON_MAGs_metagenomes_chemistry_Burkholderiales,
  aes(x = Genus, y = `Ecosystem Subtype`, color = `Family`)
) +
  geom_point(show.legend = FALSE) +
  labs(title = "A", x = "", y = "Ecosystem Subtype") +
  theme_classic() +
  John_theme +
  theme(title = element_text(size = 20))

# Panel B: site ID per genus (legend shown)
burkholderiales_panel_b <- ggplot(
  NEON_MAGs_metagenomes_chemistry_Burkholderiales,
  aes(x = Genus, y = `Site ID.x`, color = `Family`)
) +
  geom_point() +
  labs(title = "B", x = "", y = "Site ID") +
  theme_classic() +
  John_theme +
  theme(
    title = element_text(size = 20),
    legend.text = element_text(size = 10, face = "italic"),
    legend.title = element_text(size = 15)
  )

# Panel C: genome size (Kbp) per genus
burkholderiales_panel_c <- ggplot(
  NEON_MAGs_metagenomes_chemistry_Burkholderiales,
  aes(x = `Genus`, y = `Genome Size (Kbp)`, color = `Family`)
) +
  geom_boxplot(show.legend = FALSE) +
  scale_y_continuous(limits = c(0, 15000), breaks = seq(2500, 15000, by = 2500)) +
  theme_classic() +
  labs(title = "C", x = "Genus", y = "Genome Size (Kbp)") +
  John_theme +
  theme(title = element_text(size = 20))

# Panel D: number of MAGs per genus (legend shown)
burkholderiales_panel_d <- ggplot(
  NEON_MAGs_metagenomes_chemistry_Burkholderiales,
  aes(x = Genus, fill = `Family`)
) +
  geom_bar() +
  labs(title = "D", x = "Genus", y = "MAGs (n)") +
  theme_classic() +
  John_theme +
  theme(
    title = element_text(size = 20),
    legend.text = element_text(size = 10, face = "italic"),
    legend.title = element_text(size = 15)
  )

burkholderiales_panel_a + burkholderiales_panel_b +
  burkholderiales_panel_c + burkholderiales_panel_d
Figure 17:
Burkholderiales
# Keep only the Burkholderiales rows whose Family contains "Burkholderiaceae".
NEON_MAGs_metagenomes_chemistry_Burkholderiaceae <- filter(
  NEON_MAGs_metagenomes_chemistry_Burkholderiales,
  str_detect(Family, "Burkholderiaceae")
)
# Burkholderiaceae MAG counts stacked by Genus: per ecosystem subtype
# (panel A, legend hidden) and per site ID (panel B, legend shown).
# Panels are combined with `+` (presumably patchwork -- confirm it is attached).
burkholderiaceae_panel_a <- NEON_MAGs_metagenomes_chemistry_Burkholderiaceae %>%
  ggplot(aes(x = `Ecosystem Subtype`, fill = Genus)) +
  geom_bar(show.legend = FALSE) +
  labs(title = "A", x = "Ecosystem Subtype", y = "MAGs (n)") +
  theme_classic() +
  theme(
    axis.title = element_text(size = 15),
    title = element_text(size = 20),
    axis.text.y = element_text(color = "black"),
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1, color = "black")
  )

burkholderiaceae_panel_b <- NEON_MAGs_metagenomes_chemistry_Burkholderiaceae %>%
  ggplot(aes(x = `Site ID.x`, fill = Genus)) +
  geom_bar() +
  labs(title = "B", x = "Site ID", y = "MAGs (n)") +
  theme_classic() +
  theme(
    axis.title = element_text(size = 15),
    axis.text.y = element_text(color = "black"),
    axis.text.x = element_text(color = "black", angle = 45, vjust = 1, hjust = 1),
    title = element_text(size = 20),
    legend.text = element_text(size = 10, face = "italic"),
    legend.title = element_text(size = 15)
  )

burkholderiaceae_panel_a + burkholderiaceae_panel_b
Figure 18: Burkholderiales
Burkholderiaceae
Members of the order Burkholderiales were also examined further at the genus level to determine if their broader diversity corresponded to the variety of ecosystems they were found in. Indeed, members of this order were found across the United States in several different ecosystem subtypes. Individually assembled members of the Burkholderiaceae, including the genera Caballeronia, Herbaspirillum, and Paraburkholderia, were found mainly in temperate forests, with some members also found in tundra and Boreal forest/Taiga subtypes (Fig. 17, 18). Interestingly, the genome size distribution of a genus does not appear to be correlated with its number of members. Despite having only 4 MAGs from the Niwot Ridge site in Colorado, the genus Herbaspirillum contained a broad range of genome sizes. The genus Caballeronia, containing 5 MAGs, had a much tighter distribution of genome size (Fig. 17). Also of note is the appearance of the genus Trinickia in the Wind River Experimental Forest in Washington (Fig. 17, 18). Trinickia members have previously been found to form endosymbiotic relationships with the phytopathogenic fungus Rhizopus microsporus (Source).
# Embed the pre-rendered Toolik sankey HTML page in the knitted output.
knitr::include_url(
  url = "data/lab14/sankey-NEON_MAG_Toolik.txt.html"
)
Figure 19: Sankey plot of MAGs from Toolik Field Station
# Circular cladogram of tree_bac_TOOL_MAGs (branch lengths ignored) with two
# highlighted and labelled clades.
# NOTE(review): nodes 258 and 259 are hard-coded ids; they presumably index
# the Pseudomonadota and Gammaproteobacteria clades of this particular tree --
# re-check them whenever the tree is regenerated.
ggtree(tree_bac_TOOL_MAGs, layout = "circular", branch.length = "none") +
  # Pseudomonadota: grey highlight plus clade label
  geom_hilight(node = 258, fill = "grey", alpha = 0.6) +
  geom_cladelab(
    node = 258,
    label = "Pseudomonadota",
    align = TRUE,
    angle = "auto",
    offset.text = 0.5,
    textcolor = "black",
    barcolor = "grey",
    barsize = 1.5,
    fontsize = 5
  ) +
  # Gammaproteobacteria: steel-blue highlight, label bar pushed outwards
  geom_hilight(node = 259, fill = "steelblue", alpha = 0.6) +
  geom_cladelab(
    node = 259,
    label = "Gammaproteobacteria",
    align = TRUE,
    angle = "auto",
    offset = 0.75,
    offset.text = 0.5,
    textcolor = "black",
    barcolor = "steelblue",
    barsize = 1.5,
    fontsize = 5
  )
Figure 20: Phylogenetic tree of all MAGs collected in
GOLD Study ID Gs0161344 by the National Ecological Observatory Network
at Toolik Field Station, Alaska USA. The Gammaproteobacteria class
of Pseudomonadota is highlighted in blue.
# Horizontal bar chart of MAG counts per Phylum, stacked by Order.
# fct_infreq() orders phyla by frequency and fct_rev() reverses that order
# before coord_flip() turns the bars sideways.
ggplot(
  NEON_MAGs_metagenomes_chemistry_TOOL,
  aes(x = fct_rev(fct_infreq(Phylum)), fill = Order)
) +
  geom_bar() +
  coord_flip() +
  labs(x = "Phylum", y = "MAG Count (n)", fill = "Order") +
  theme_classic() +
  theme(
    axis.title = element_text(size = 15),
    axis.text.y = element_text(color = "black", face = "italic"),
    axis.text.x = element_text(color = "black"),
    legend.text = element_text(size = 10, face = "italic"),
    legend.title = element_text(size = 15)
  )
Figure 21: MAG Count by Phylum at Toolik Field
Station.
# Per-site scatterplots of soil temperature (panel A) and soil pH in water
# (panel B), coloured by site ID, composed side by side with `+`.
#
# size = 1 is set on geom_point(): the original passed it to ggplot(), whose
# `...` is unused, so the requested point size was silently ignored.
# Rows whose y value is missing or outside the fixed limits are dropped by
# ggplot2 with a warning.
NEON_MAGs_metagenomes_chemistry %>%
  ggplot(aes(y = `soilTemp`, x = `Site ID.x`, color = `Site ID.x`)) +
  geom_point(size = 1, show.legend = FALSE) +
  scale_y_continuous(limits = c(0, 50), breaks = seq(10, 50, by = 10)) +
  labs(title = "A", y = "Soil Temperature", x = "Site ID") +
  theme_classic() +
  theme(
    title = element_text(size = 15),
    axis.title = element_text(size = 15),
    axis.text.y = element_text(color = "black"),
    axis.text.x = element_text(color = "black", angle = 45, vjust = 1, hjust = 1)
  ) +
  ggplot(
    data = NEON_MAGs_metagenomes_chemistry,
    aes(y = `soilInWaterpH`, x = `Site ID.x`, color = `Site ID.x`)
  ) +
  geom_point(size = 1, show.legend = FALSE) +
  scale_y_continuous(limits = c(0, 14), breaks = 1:14) +
  labs(title = "B", y = "Soil pH", x = "Site ID") +
  theme_classic() +
  theme(
    title = element_text(size = 15),
    axis.title = element_text(size = 15),
    axis.text.y = element_text(color = "black"),
    axis.text.x = element_text(color = "black", angle = 45, vjust = 1, hjust = 1)
  )
## Warning: Removed 625 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 624 rows containing missing values or values outside the scale range
## (`geom_point()`).
Figure 21: NEON Site Sample Temperatures and
pH. A) Scatterplot of sample soil temperatures at each NEON
site. B) Scatterplot of sample soil pHs in water at each NEON site.
# Soil temperature per Phylum, one facet per site (Site.x), three columns.
# size = 1 is set on geom_point(): the original passed it to ggplot(), whose
# `...` is unused, so the requested point size was silently ignored.
# Rows with a missing soilTemp or one outside 0-10 are dropped with a warning.
NEON_MAGs_metagenomes_chemistry_Alaska %>%
  ggplot(aes(x = `Phylum`, y = `soilTemp`, color = `Phylum`)) +
  geom_point(size = 1, show.legend = FALSE) +
  facet_wrap(vars(Site.x), scales = "free_y", ncol = 3) +
  scale_y_continuous(limits = c(0, 10)) +
  labs(y = "Soil Temperature", x = "Phylum") +
  theme_classic() +
  John_theme +
  theme(strip.text = element_text(size = 13))
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_point()`).
Figure 22: Soil temperatures
# Soil pH (in water) per Phylum, one facet per site (Site.x), three columns.
# size = 1 is set on geom_point(): the original passed it to ggplot(), whose
# `...` is unused, so the requested point size was silently ignored.
# breaks uses 0:14 directly; the original rep(0:14) wrapper was a no-op.
NEON_MAGs_metagenomes_chemistry_Alaska %>%
  ggplot(aes(x = `Phylum`, y = `soilInWaterpH`, color = `Phylum`)) +
  geom_point(size = 1, show.legend = FALSE) +
  facet_wrap(vars(Site.x), scales = "free_y", ncol = 3) +
  scale_y_continuous(limits = c(0, 14), breaks = 0:14) +
  labs(y = "Soil pH", x = "Phylum") +
  theme_classic() +
  John_theme +
  theme(strip.text = element_text(size = 13))
Figure 23: Sample soil pH in water
# Vegetation class (nlcdClass) per Phylum, one facet per site (Site.x).
# size = 1 is set on geom_point(): the original passed it to ggplot(), whose
# `...` is unused, so the requested point size was silently ignored.
NEON_MAGs_metagenomes_chemistry_Alaska %>%
  ggplot(aes(x = `Phylum`, y = `nlcdClass`, color = `Phylum`)) +
  geom_point(size = 1, show.legend = FALSE) +
  facet_wrap(vars(Site.x), scales = "free_y", ncol = 3) +
  labs(y = "Vegetation Class", x = "Phylum") +
  theme_classic() +
  John_theme +
  theme(strip.text = element_text(size = 10))
Figure 24: Sample National Land Cover Database (NLCD)
Vegetation Class
# MAG counts per Phylum, stacked by Order, one facet per site (Site.x) laid
# out in three columns with independent y axes.
ggplot(
  NEON_MAGs_metagenomes_chemistry_Alaska,
  aes(x = `Phylum`, fill = `Order`)
) +
  geom_bar() +
  facet_wrap(vars(Site.x), scales = "free_y", ncol = 3) +
  labs(y = "MAGs (n)", x = "Phylum") +
  theme_classic() +
  John_theme +
  theme(
    strip.text = element_text(size = 10),
    legend.title = element_text(size = 15),
    legend.text = element_text(size = 10, face = "italic")
  )
Figure 25: MAGs found at each Alaskan NEON
Site.
# Horizontal bar chart of MAG counts per Phylum, stacked by Order, with the
# count axis fixed to 0-50.
# The category axis is labelled "Phylum": the mapped variable is Phylum, and
# the original label "Site" did not match it.
# NOTE(review): with limits = c(0, 50) any phylum with more than 50 MAGs would
# be dropped from the plot -- widen the limit if the data grows.
NEON_MAGs_metagenomes_chemistry_TOOL %>%
  ggplot(aes(x = fct_rev(fct_infreq(Phylum)), fill = Order)) +
  geom_bar() +
  coord_flip() +
  labs(x = "Phylum", y = "MAG Count (n)", fill = "Order") +
  theme_classic() +
  theme(
    axis.text.x = element_text(color = "black"),
    axis.title = element_text(size = 15),
    axis.text.y = element_text(color = "black", face = "italic"),
    legend.title = element_text(size = 15),
    legend.text = element_text(size = 10)
  ) +
  scale_y_continuous(limits = c(0, 50), breaks = seq(0, 50, by = 10))
Figure 26: MAG Count by Phylum at Toolik Field
Station
# Boxplot of genome size (Kbp) per Phylum. The Kbp column is computed on the
# fly (total bases / 1000, truncated to an integer) and is not stored back
# into NEON_MAGs_metagenomes_chemistry_TOOL.
ggplot(
  mutate(
    NEON_MAGs_metagenomes_chemistry_TOOL,
    `Genome Size (Kbp)` = as.integer(`Total Number of Bases` / 1000)
  ),
  aes(x = `Phylum`, y = `Genome Size (Kbp)`, color = `Phylum`)
) +
  geom_boxplot(show.legend = FALSE) +
  scale_y_continuous(limits = c(0, 15000), breaks = seq(2500, 15000, by = 2500)) +
  theme_classic() +
  labs(x = "Phylum", y = "Genome Size (Kbp)") +
  John_theme
Figure 27: Genome Size Distributions at Toolik
Field Station
# Tree of tree_bac_TOOL_MAGs with the metadata table attached via %<+%,
# tips coloured by Phylum, plus two side panels drawn by geom_facet():
# gene count (points) and % GC content (horizontal bars).
ggtree(tree_bac_TOOL_MAGs) %<+%
  NEON_MAGs_metagenomes_chemistry +
  geom_tippoint(aes(colour = `Phylum`)) +
  # For unknown reasons the following does not like blank spaces in the names,
  # so the facets read from the *_noblank copy of the table
  geom_facet(
    panel = "Gene Count (n)",
    data = NEON_MAGs_metagenomes_chemistry_TOOL_noblank,
    geom = geom_point,
    mapping = aes(x = GeneCount, color = Phyla)
  ) +
  geom_facet(
    panel = "% GC Content ",
    data = NEON_MAGs_metagenomes_chemistry_TOOL_noblank,
    geom = geom_col,
    mapping = aes(x = GCassembled, fill = Phyla),
    orientation = "y",
    width = 0.6,
    show.legend = FALSE
  ) +
  theme_tree2(
    legend.position = c(0.1, 0.7),
    strip.text = element_text(size = 15),
    axis.text = element_text(color = "black")
  )
Figure 28: Genome content of bacteria found at
Toolik Field Station. Left Panel: Phylogenetic tree of all MAGs
found at Toolik Field Station, Alaska USA with markers for MAG phylum.
Middle Panel: Gene Count of all MAGs found at Toolik Field Station,
Alaska USA. Points colored by MAG phylum. Right Panel: % GC Content of
all MAGs found at Toolik Field Station, Alaska USA. Bar colored by MAG
phylum.
# Two-panel view of NEON_MAGs_metagenomes_chemistry_TOOL_Steroidobacterales:
# genome size per genus (panel A) and MAG count per genus (panel B).
# Panels are combined with `+` (presumably patchwork -- confirm it is attached).

# Panel A: genome size (Kbp) per genus
tool_steroido_panel_a <- ggplot(
  NEON_MAGs_metagenomes_chemistry_TOOL_Steroidobacterales,
  aes(x = `Genus`, y = `Genome Size (Kbp)`, color = Genus)
) +
  geom_boxplot(show.legend = FALSE) +
  scale_y_continuous(limits = c(0, 15000), breaks = seq(2500, 15000, by = 2500)) +
  theme_classic() +
  labs(title = "A", x = "Genus", y = "Genome Size (Kbp)") +
  John_theme +
  theme(title = element_text(size = 15))

# Panel B: genus count
tool_steroido_panel_b <- ggplot(
  NEON_MAGs_metagenomes_chemistry_TOOL_Steroidobacterales,
  aes(x = Genus, fill = Genus)
) +
  geom_bar(show.legend = FALSE) +
  labs(title = "B", x = "Genus", y = "MAGs (n)") +
  theme_classic() +
  John_theme +
  theme(title = element_text(size = 15))

tool_steroido_panel_a + tool_steroido_panel_b
Figure 29: Steroidobacterales
A) Boxplot of sample genome size (Kbp) in the
Steroidobacterales order at Toolik Field Station,
Alaska USA. B) Barplot of MAGs for each genus in the
Steroidobacterales order found at Toolik Field
Station, Alaska USA.
# MAG counts per Genus, stacked by Family, with the count axis fixed to 0-5
# and a tick at every integer.
ggplot(
  NEON_MAGs_metagenomes_chemistry_TOOL_Burkholderiales,
  aes(x = Genus, fill = `Family`)
) +
  geom_bar() +
  labs(x = "Genus", y = "MAGs (n)") +
  theme_classic() +
  John_theme +
  theme(
    legend.title = element_text(size = 15),
    legend.text = element_text(size = 10, face = "italic")
  ) +
  scale_y_continuous(limits = c(0, 5), breaks = 0:5)
Figure 30: Genus distribution of
Burkholderiales is widespread at
Toolik Field Station, Alaska USA. Barplot of MAGs in
Burkholderiales genera at Toolik Field Station, Alaska
USA. NA represents novel genera in Burkholderiales.
Several notable genera in Burkholderiales were found in this study: Caballeronia and Paraburkholderia (known nitrogen fixers and endosymbionts), and Trinickia (novel; associated with a phytopathogen, from journal club).
# Dot plot marking which vegetation classes (nlcdClass) occur at each site.
ggplot(
  NEON_MAGs_metagenomes_chemistry,
  aes(y = `Site.x`, x = nlcdClass)
) +
  geom_point() +
  labs(
    title = "NEON Site Vegetation Classes",
    y = "Site",
    x = "Vegetation Class"
  ) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))
# Dot plot marking which ecosystem subtypes occur at each site.
ggplot(
  NEON_MAGs_metagenomes_chemistry,
  aes(y = `Site.x`, x = `Ecosystem Subtype`)
) +
  geom_point() +
  labs(
    title = "NEON Site Ecosystem Subtypes",
    y = "Site",
    x = "Ecosystem Subtype"
  ) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))
### Figure 2: Sankey plot of all NEON Combined assembly MAGs
# Embed the pre-rendered combined-assembly sankey HTML page in the output.
knitr::include_url(
  url = "data/lab14/sankey-NEON_MAG_co_pavian.txt.html"
)
Figure 2: Sankey plot of all NEON combined assembly MAGs
# Embed the pre-rendered sankey HTML page stored as
# sankey-NEON_MAG_co_Gpro.txt.html in the output.
knitr::include_url(
  url = "data/lab14/sankey-NEON_MAG_co_Gpro.txt.html"
)
Figure 2: Sankey plot of all NEON combined assembly Gammaproteobacteria MAGs
# Tree of tree_Gammaproteobacteria with the Gammaproteobacteria metadata
# attached via %<+%, tips coloured by Order, plus two side panels drawn by
# geom_facet(): gene count (points) and % GC content (horizontal bars).
ggtree(tree_Gammaproteobacteria) %<+%
  NEON_MAGs_metagenomes_chemistry_Gammaproteobacteria +
  geom_tippoint(aes(color = `Order`)) +
  # For unknown reasons the following does not like blank spaces in the names,
  # so the facets read from the *_noblank copy of the table
  geom_facet(
    panel = "Gene Count",
    data = NEON_MAGs_metagenomes_chemistry_Gamma_noblank,
    geom = geom_point,
    mapping = aes(x = GeneCount, color = Orders)
  ) +
  geom_facet(
    panel = "% GC Content ",
    data = NEON_MAGs_metagenomes_chemistry_Gamma_noblank,
    geom = geom_col,
    mapping = aes(x = GCassembled, fill = Orders),
    orientation = "y",
    width = 0.6,
    show.legend = FALSE
  ) +
  theme_tree2(legend.position = c(0.1, 0.7))
# Tree of tree_Gammaproteobacteria with the full metadata table attached via
# %<+%, x axis widened to 0-20, and points coloured by site (Site.x).
# NOTE(review): geom_point() draws a point at every node of the tree, not only
# at the tips; nodes without metadata have no Site.x value.
ggtree(tree_Gammaproteobacteria) %<+%
  NEON_MAGs_metagenomes_chemistry +
  xlim(0, 20) +
  geom_point(mapping = aes(color = `Site.x`))
Figure 5: Phylogenetic tree of all MAGs belonging to the class Gammaproteobacteria with Gene count and %GC Content.
redo this because points are overlapping
GGtree Yu G (2022). Data Integration, Manipulation and Visualization of Phylogenetic Trees, 1st edition. Chapman and Hall/CRC. doi:10.1201/9781003279242, https://www.amazon.com/Integration-Manipulation-Visualization-Phylogenetic-Computational-ebook/dp/B0B5NLZR1Z/.
Xu S, Li L, Luo X, Chen M, Tang W, Zhan L, Dai Z, Tommy T. Lam, Guan Y, Yu G (2022). “Ggtree: A serialized data object for visualization of a phylogenetic tree and annotation data.” iMeta, 1(4), e56. doi:10.1002/imt2.56, https://onlinelibrary.wiley.com/doi/full/10.1002/imt2.56.
Yu G (2020). “Using ggtree to Visualize Data on Tree-Like Structures.” Current Protocols in Bioinformatics, 69(1), e96. doi:10.1002/cpbi.96, https://currentprotocols.onlinelibrary.wiley.com/doi/abs/10.1002/cpbi.96.
Yu G, Lam TT, Zhu H, Guan Y (2018). “Two methods for mapping and visualizing associated data on phylogeny using ggtree.” Molecular Biology and Evolution, 35, 3041-3043. doi:10.1093/molbev/msy194, https://academic.oup.com/mbe/article/35/12/3041/5142656.
Yu G, Smith D, Zhu H, Guan Y, Lam TT (2017). “ggtree: an R package for visualization and annotation of phylogenetic trees with their covariates and other associated data.” Methods in Ecology and Evolution, 8, 28-36. doi:10.1111/2041-210X.12628, http://onlinelibrary.wiley.com/doi/10.1111/2041-210X.12628/abstract.
gold Please cite: Supratim Mukherjee, Dimitri Stamatis, Cindy Tianqing Li, Galina Ovchinnikova, Jon Bertsch, Jagadish Chandrabose Sundaramurthi, Mahathi Kandimalla, Paul A. Nicolopoulos, Alessandro Favognano, I-Min A. Chen , Nikos C. Kyrpides and T.B.K. Reddy. Twenty-five years of Genomes OnLine Database (GOLD): data updates and new features in v.9. Nucl. Acids Res. (2022) doi: doi.org/10.1093/nar/gkac974
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8269246/